Load packages and import data

knitr::opts_chunk$set(echo = TRUE)
library(assertthat)
library(ggplot2)
library(dplyr)
library(tidyr)
library(PopGenome)
library(stringr)
library(data.table)
library(magrittr)
library(wrapr)
library(foreach)

ms2 = read.table("~/Documents/Research/PloidySim/data/mssel_out_2-24-19_fin.txt", head = T)

#Group windows into larger sets
ms2 %<>% mutate(ints = as.numeric(as.character(cut(bp.start, breaks = 50, labels = seq(1,50)))))

Background

Dominance

This plot shows the dominance value of each genotype (given by within individual allele frequency) as a function of the dominance scalar. The purpose was to write a function that would take a single dominance value that could be specified for any ploidy and return a comparable set of dominance coefficients for heterozygous genotypes.

getDomCoefs = function(dominance, ploidy = 100){
  if (dominance == 0){
    coeffs = rep(0, ploidy - 1)
  }
  else if (dominance == 1){
    coeffs = rep(1, ploidy - 1)
  }
  else {
    dom = (1 - dominance) - 0.5
    dom = abs((dom * 10 + sign(dom)) ^ sign(dom))
    coeffs = (seq(1, ploidy - 1) / ploidy) ^ dom
  }
  return(coeffs)
}

doms = c(seq(0,1, 0.1))
Doms = as.data.frame(sapply(doms, getDomCoefs))
Doms = rbind(c(0,0,0,0,0,0,0), Doms, c(1,1,1,1,1,1,1))
colnames(Doms) = doms
Doms = cbind(Doms, data.frame("i/k"=seq(0,100) / 100))
mDoms = melt(Doms, id.var = "i.k")
ggplot(mDoms, aes(x = i.k, y = value, color = variable)) + geom_line() + scale_color_discrete(name = "Dominance\nScalar") + theme_bw() + xlab("i / k") + ylab("Dominance Coefficient")

# Example

# dominance=c(seq(0.1,0.9,0.1))
# > dominance
# [1] 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9

# Centers around 0.5 (additivity)
# dom = (1 - dominance) - 0.5
# > dom
# [1]  0.4  0.3  0.2  0.1  0.0 -0.1 -0.2 -0.3 -0.4

# dom1 = abs((dom * 10 + sign(dom)) ^ sign(dom))
# > dom1
# [1] 5.0000000 4.0000000 3.0000000 2.0000000 1.0000000 0.5000000 0.3333333 0.2500000 0.2000000

# > getDomCoefs(dominance = 0.2, ploidy = 4)
# [1] 0.00390625 0.06250000 0.31640625
# > getDomCoefs(dominance = 0.5, ploidy = 4)
# [1] 0.25 0.50 0.75

Sweep trajectories

These plots show a primary motivating result for this study, in that higher ploidies show increased fixation times of sweeping beneficial alleles regardless of the dominance scenario being evaluated.

#Function creates trajectories
getTraj = function(s, dom, ploidy, start_freq, N = -9, end_freq = 0.99, maxGens = 9999, maxTries = 10){
  dom_coeffs = getDomCoefs(dom, ploidy)
  attempts = 0
  p = start_freq
  k = ploidy
  gen = 0
  while (attempts <= maxTries & p < end_freq & gen <= maxGens){
    df = data.frame("s" = s, "dom" = dom, "gen" = 0, "freq" = start_freq, "dp1" = 0, "w.bar" = 0, "ploidy" = ploidy)
    p = start_freq
    dp1 = 0
    gen = 0
    het_fits = 1 + dom_coeffs * s
    fits = c(1, het_fits, 1 + s) / (1 + s)
    while (p < end_freq & p > 0 & gen <= maxGens){
      q = 1 - p
      i = seq(0, k)
      Num = sum(choose(k, i) * ((k - i) / k) * (p ^ (k - i)) * (q ^ i) * rev(fits))
      Den = sum(choose(k, i) * (p ^ (k - i)) * (q ^ i) * rev(fits))
      p_prime = Num / Den
      if (N != -9){
        p_prime = sum(rbinom(N, k, p_prime)) / (k * N) 
      }
      df = rbind(df, c(s, dom, gen, p, dp1, Den, k))
      dp1 = p_prime - p
      p = p_prime
      gen = gen + 1
    }
    df = rbind(df, c(s, dom, gen, p, dp1, Den, k))
    attempts = attempts + 1
  }
  if (attempts > maxTries & nrow(df) <= maxGens){
    print(paste("Beneficial allele was lost due to drift for", maxTries, "consecutive attempts"))
  }
  if (nrow(df) > maxGens){
    print(paste("Beneficial allele did not fix before the maximum number of generations (", maxGens, ")."))
  }
  return(df[-1,])
}

# Param set to create trajectories for
params = data.frame("s" = rep(0.1,9), "dom" = c(rep(0.5,3), rep(0,3), rep(1,3)), "ploidy" = rep(c(2,4,8),3), "start" = rep(0.05, 9))

# Create trajectories
dat1 = foreach(i = 1:nrow(params), .combine=rbind) %do% {
  getTraj(params[i,]$s, params[i,]$dom, params[i,]$ploidy, params[i,]$start)
}
## [1] "Beneficial allele did not fix before the maximum number of generations ( 9999 )."
## [1] "Beneficial allele did not fix before the maximum number of generations ( 9999 )."
## [1] "Beneficial allele did not fix before the maximum number of generations ( 9999 )."
## [1] "Beneficial allele did not fix before the maximum number of generations ( 9999 )."
#plot
ggplot(dat1, aes(x=gen, y=freq, linetype=as.factor(dom), color=as.factor(ploidy))) + geom_line() + scale_color_discrete(name="Ploidy") + scale_linetype_discrete(name="Dominance Scalar", labels=c("0.1", "0.5", "0.9")) + xlim(0,300) + theme_bw() + xlab("Generation") + ylab("Allele Frequency")
## Warning: Removed 40024 rows containing missing values (geom_path).

Fitness

First plot shows reason why allele frequency change is faster in lower ploidies; increased variance in fitness at any given frequency. The second plot just shows mean fitness as a function of allele frequency and is not terribly interesting.

getFits = function(s, dom, ploidy){
  dom_coeffs = getDomCoefs(dom, ploidy)
  k = ploidy
  dat = foreach(i = 1:51, .combine=rbind) %do% {
    p = seq(0,1,1/50)[i]
    q = 1 - p
    het_fits = 1 + dom_coeffs * s
    fits = c(1, het_fits, 1 + s) / (1 + s)
    i = seq(0, k)
    freqs = choose(k, i) * (p ^ (k - i)) * (q ^ i)
    w.bar = sum(freqs * rev(fits))
    var.w = sum((freqs * (rev(fits) - w.bar) ^ 2) / length(freqs))
    data.frame('s' = s, 'dom' = dom, 'ploidy' = ploidy, 'freq' = p, 'w.bar' = w.bar, 'var.w' = var.w)
  }
  return(dat)
}

dat2 = foreach(i = 1:nrow(params), .combine=rbind) %do% {
  getFits(params[i,]$s, params[i,]$dom, params[i,]$ploidy)
}

ggplot(dat2, aes(x=freq, y=var.w, color = as.factor(ploidy), linetype=as.factor(dom))) + geom_line() + scale_color_discrete(name="Ploidy") + scale_linetype_discrete(name="Dominance", labels=c("0.1","0.5","0.9")) + xlab("Allele Frequency") + ylab("Variance in Fitness") + theme_bw()

ggplot(dat2, aes(x=freq, y=w.bar, color = as.factor(ploidy), linetype=as.factor(dom))) + geom_line() + scale_color_discrete(name="Ploidy") + scale_linetype_discrete(name="Dominance", labels=c("0.1","0.5","0.9")) + xlab("Allele Frequency") + ylab("Mean Fitness") + theme_bw()


mssel analysis

Variable key

  • N = Population size

  • s = selection coefficient; difference in fitness between alternative homozygotes

  • dom = dominance scalar; a single value that determines the dominance coefficients of heterozygous genotypes for an arbitrary ploidy

  • recomb = per-base recombination rate

  • mu = per-base mutation rate. Theta is ploidy x mu x N (x sequence length which is 1Mb)

  • sim_ploidy = Simulation Ploidy; ploidy used for mssel simulation. Value used in calculating theta above

  • traj_ploidy = Trajectory Ploidy; ploidy used to generate sweep trajectory. see getTraj() function in PloidyHitch.R

  • sampGen = sampling generation; number of generations passed since selection ends (i.e. fixation of beneficial allele)

  • fuseGen = fuse generation; number of generations prior to start of sweep that the two populations fused.

Simulation Ploidy and Trajectory Ploidy are the two focal variables. The former refers to the ploidy value used to scale population parameters when running mssel. For example, theta = 2(N)(mu)(sim_ploidy). The latter refers to the ploidy used for simulating the sweep trajectory, which is subsequently passed to mssel.

Summary of depth and breadth of peaks

Controlling for number of chromosomes in the population (i.e. Simulation Ploidy), higher ploidies have a shallower dip, due to the slowed rate of fixation of the sweeping allele. Similarly, the dips in diversity are narrower in higher ploidies for same reason as before.

Depth

MS2.a = ms2 %>% group_by(s, dom, recomb, N, mu, sim_ploidy, traj_ploidy, sampGen, fuseGen, rep) %>% summarize(depth = (mean(Pi.2,na.rm=T) - min(Pi.1)) / mean(Pi.2,na.rm=T))

MS2.a %>% filter(fuseGen == 1 & mu==1e-08 & N == 10000 & recomb=="1e-08" & s =="0.01") %>% ggplot(., aes(y=-(log(depth)), fill = as.factor(traj_ploidy), x = as.factor(sim_ploidy))) + geom_boxplot() + ylab(expression(paste("Dip Depth = min(", pi[1], ") / ",pi[2],sep = ""))) + facet_grid(~sampGen) + scale_y_log10() + scale_fill_discrete(name = "Trajectory\nPloidy") + xlab("Simulation Ploidy") + theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 341 rows containing non-finite values (stat_boxplot).

Breadth

MS2.b = ms2 %>% group_by(s, dom, recomb, N, mu, sim_ploidy, traj_ploidy, sampGen, fuseGen, rep, ints) %>% summarize(lessWind = ifelse(mean(Pi.1, na.rm=T) < (mean(Pi.2, na.rm=T) / 2),1,0))

MS2.c = MS2.b %>% group_by(s, dom, recomb, N, mu, sim_ploidy, traj_ploidy, sampGen, fuseGen, rep) %>% summarize(breadth = sum(lessWind) * 20000)

MS2.c %>% filter(fuseGen == 1 & mu==1e-08 & N == 10000 & recomb=="1e-08" & s =="0.01" & sampGen == 1) %>% ggplot(., aes(y=breadth/1000, fill = as.factor(traj_ploidy), x = as.factor(sim_ploidy))) + geom_boxplot(notch=T) + ylab(expression(paste("Dip Breadth = Length (kb) of which ", pi[1], " < (",pi[2], " / 2)",sep = ""))) + scale_fill_discrete(name = "Trajectory\nPloidy") + xlab("Simulation Ploidy") + theme_bw()

Decay

Does breadth of peak decay more quickly for a given simulation ploidy

MS2.c %>% filter(fuseGen == 1 & mu==1e-08 & N == 10000 & recomb=="1e-08" & s =="0.01") %>% ggplot(., aes(y=breadth/1000, fill = as.factor(sampGen), x = as.factor(sim_ploidy))) + geom_boxplot(notch=T) + ylab(expression(paste("Dip Breadth = Length (1 kb) of which ", pi[1], " < (",pi[2], " / 2)",sep = ""))) + scale_y_log10() + scale_fill_discrete(name = "Generations prior\nto fixation") + xlab("Simulation Ploidy") + theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 59 rows containing non-finite values (stat_boxplot).

Diversity plots

Trajectory effect

Here, we are keeping fixed the ploidy specified for the coalescent simulations (fixed at 8 in this example), and we are varying the ploidy under which the sweep trajectories are simulated. For some parameter sets, higher ploidies and thus slower sweeps results in a visible difference in terms of the reduction of diversity at and around the beneficial allele. The effect is more pronounced for weaker selection and for partial dominance/recessivity. The breadth of the dips is also notably lesser for higher ploidies. Thus, if chromosome number in a population is held constant, higher ploidies exhibit reduced signals associated with selective sweeps.

ms2 %>% filter(dom %in% c("0.1","0.9","0.5") & N == 10000 & fuseGen == 1 & sampGen ==1 & mu==1e-08 & sim_ploidy == 8) %>% ggplot(., aes(y=Pi.1 * 100, color = as.factor(traj_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + xlab("Position (100 kb)") + ylab("Diversity (%)") + scale_color_discrete(name="Trajectory\nPloidy") + facet_grid(vars(s, recomb), vars(dom),labeller = label_both, scales = "free_y") + theme_bw()

Factor-of-2 effect

This is the complement to the previous figure. We use the same sweep trajectory, but we vary the ploidy specified for the coalescent simulation.

ms2 %>% filter(dom %in% c("0.1","0.9","0.5") & fuseGen == 1 & sampGen ==1 & mu==1e-08 & N == 10000 & traj_ploidy == 4) %>% ggplot(., aes(y=Pi.1 * 100, color = as.factor(sim_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + scale_y_continuous(breaks=c(0.05,0.15)) + xlab("Position (100 kb)") + ylab("Diversity (%)") + scale_color_discrete(name="Simulation\nPloidy") + facet_grid(vars(recomb,s), vars(dom),labeller = label_both) + theme_bw()

Combined effects

Higher ploidies result in deeper and broader dips in diversity relative to diploids. The effect of recombination rate is not striking.

ms2 %>% filter(dom %in% c("0.1","0.9","0.5") & fuseGen == 1 & sampGen ==1 & mu==1e-08 & N == 10000 & sim_ploidy == traj_ploidy) %>% ggplot(., aes(y=Pi.1 * 100, color = as.factor(traj_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + scale_y_continuous(breaks=c(0.05,0.15)) + xlab("Position (100 kb)") + ylab("Diversity (%)") + scale_color_discrete(name="Ploidy") + facet_grid(vars(recomb,s), vars(dom),labeller = label_both) + theme_bw()

Effect of trajectory ploidy, simulation ploidy, and sample generation on classic selection metrics

Tajima’s D

Polyploids recover new mutations more quickly, but they take longer to drift to higher frequencies. ThetaW goes up but ThetaPi lags…

# ms2 %>% filter(dom %in% c("0.1","0.9","0.5") & s==0.01 & fuseGen == 1 & mu==1e-08 & N == 10000 & sim_ploidy == traj_ploidy & recomb == 1e-08) %>% ggplot(., aes(y=Tajima.D_1, color = as.factor(sim_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + xlab("Position (100 kb)") + ylab("Tajima's D") + scale_color_discrete(name="Ploidy") + facet_grid(vars(sampGen), vars(dom),labeller = label_both) + theme_bw()

ms2 %>% filter(dom == "0.5" & s==0.01 & fuseGen == 1 & mu==1e-08 & N == 10000 & recomb == 1e-08) %>% ggplot(., aes(y=Tajima.D_1, color = as.factor(sim_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + xlab("Position (100 kb)") + ylab("Tajima's D") + scale_color_discrete(name="Ploidy") + facet_grid(vars(sampGen), vars(traj_ploidy),labeller = label_both) + theme_bw()
## Warning: Removed 204 rows containing non-finite values (stat_smooth).

Fst

Neutral divergence in allele frequencies increases Fst more quickly in lower ploidies, thus erasing ‘outlier’ signal due to sweep

ms2 %>% filter(dom %in% c("0.1","0.9","0.5") & s==0.01 & fuseGen == 1 & mu==1e-08 & N == 10000 & sim_ploidy == traj_ploidy & recomb == 1e-08) %>% ggplot(., aes(y=fst, color = as.factor(sim_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + xlab("Position (100 kb)") + ylab("Fst") + scale_color_discrete(name="Ploidy") + facet_grid(vars(sampGen), vars(dom),labeller = label_both) + theme_bw()


Analysis of rehh calculations

This R package calculates EHH (Extended Haplotype Homozygosity) and related metrics, which can be used to detect a selective sweep. I’m showing results for the XP-EHH, the cross-populatin EHH.

Link to Article

Import Data

traj.ehh = read.csv("~/Documents/Research/PloidySim/rehh/rehh_calcs_sp2_se0.01.csv")
fact.ehh = read.csv("~/Documents/Research/PloidySim/rehh/rehh_calcs_do0.5_fG1_DS.csv")

Trajectory effect

Faster sweeps in the lower ploidies results in stronger signals of selection.

traj.ehh %>% filter(fuseGen=="fG1" & sampGen=="sG1" & recomb == "re1e-08" & dom=="do0.5" & N=="NN10000") %>% ggplot(.,aes(x=POSITION / 100000,y=xpehh.p,color=traj_ploidy)) + geom_smooth(method="loess", span=0.1, se = F) + scale_color_discrete(name="Trajectory\nPloidy", breaks=c("tp2","tp4","tp8"), labels=c("2","4","8")) + xlab("Position (100 kb)") + ylab(expression("-log"[10]*"(p-value)")) + theme_bw()

Factor-of-two effect

Trajectory ploidy is constant within each panel, while simulation ploidy is varied. Higher diversity in higher ploidies produces stronger signal due to greater deviation from baseline genomic diversity.

Something strange here though…why is p value always highest when tp=sp? Maybe look at density of NA along chromosomes?

fact.ehh %>% filter(s == "se0.01" & fuseGen=="fG1" & recomb == "re1e-08" & dom=="do0.5" & N=="NN10000") %>% ggplot(.,aes(x=POSITION / 100000, y=xpehh.p, color=sim_ploidy)) + geom_smooth(method="loess", span=0.1, se = F) + scale_color_discrete(name="Simulation\nPloidy", breaks=c("sp2","sp4","sp8"), labels=c("2","4","8"))  + facet_grid(cols = vars(traj_ploidy), rows = vars(sampGen))  + xlab("Position (100 kb)") + ylab(expression("-log"[10]*"(p-value)")) + theme_bw()
## Warning: Removed 78775 rows containing non-finite values (stat_smooth).

Sample generation effect

Does signal decay faster in higher or lower ploidies? Answer seems to be that decay is faster in lower ploidies. Drift is faster in lower ploidies, which is primary mechanism that erases signal of selection.

fact.ehh %>% filter(s == "se0.01" & fuseGen=="fG1" & dom=="do0.5" & N=="NN10000" & recomb=="re1e-08") %>% ggplot(.,aes(x=POSITION,y=xpehh.p,color=sim_ploidy)) + geom_smooth() + facet_grid(rows = vars(traj_ploidy), cols = vars(sampGen))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 78775 rows containing non-finite values (stat_smooth).


Less interesting analyses

Effect of sample generation

Sample generation is the number of generations following a sweep at which sampling occurs

ms2 %>% filter(dom %in% c("0.1","0.9","0.5") & s==0.01 & fuseGen == 1 & mu==1e-08 & N == 10000 & recomb == 5e-08 & sim_ploidy == traj_ploidy) %>% ggplot(., aes(y=Pi.1 * 100, color = as.factor(sim_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + scale_y_continuous(breaks=c(0.05,0.15)) + xlab("Position (100 kb)") + ylab("Diversity (%)") + scale_color_discrete(name="Simulation\nPloidy") + facet_grid(vars(sampGen), vars(dom),labeller = label_both) + theme_bw()

Effect of population size

Nothing really interesting here…Smaller population size seems noisier as expected

ms2 %>% filter(dom %in% c("0.1","0.9","0.5") & recomb=="1e-08" & fuseGen == 1 & sampGen ==1 & mu==1e-08 & sim_ploidy == traj_ploidy) %>% ggplot(., aes(y=Pi.1 * 100, color = as.factor(traj_ploidy), x = mid/100000)) + geom_smooth(method="loess", size =0.5, span=0.1, se=F) + scale_x_continuous(breaks=c(1,5,9)) + xlab("Position (100 kb)") + ylab("Diversity (%)") + scale_color_discrete(name="Ploidy") + facet_grid(vars(s, N), vars(dom),labeller = label_both, scales = "free_y") + theme_bw()

Other plots

fact.ehh %>% filter(traj_ploidy == "tp4" & fuseGen=="fG1" & dom=="do0.5" & N=="NN10000" & recomb=="re1e-08" & sampGen=="sG1") %>% ggplot(.,aes(x=cut_interval(x=POSITION, n=50),y=xpehh.p,fill=sim_ploidy)) + geom_boxplot() + facet_grid(cols = vars(sampGen))
## Warning: Removed 20150 rows containing non-finite values (stat_boxplot).

# # Look for proportional difference in breadth/depth within a simul
MS2.a %>% filter(fuseGen == 1 & mu==1e-08 & traj_ploidy == 4 & N == 10000 & recomb=="1e-08" & s =="0.01") %>% ggplot(., aes(y=depth, fill = as.factor(sampGen), x = as.factor(sim_ploidy))) + geom_boxplot(notch=T) + ylab(expression(paste("Dip Breadth = Length of which ", pi[1], " < (",pi[2], " / 2)",sep = "")))+ scale_fill_discrete(name = "Trajectory\nPloidy") + xlab("Simulation Ploidy") + theme_bw()
## notch went outside hinges. Try setting notch=FALSE.

fact.ehh %>% filter(sim_ploidy=="sp8" & fuseGen=="fG1" & sampGen=="sG1" & recomb == "re1e-08" & dom=="do0.5" & N=="NN10000") %>% ggplot(.,aes(x=POSITION / 100000,y=xpehh.p,color=traj_ploidy)) + geom_smooth(method="loess", span=0.1, se = F) + scale_color_discrete(name="Trajectory\nPloidy", breaks=c("tp2","tp4","tp8"), labels=c("2","4","8")) + xlab("Position (100 kb)") + ylab(expression("-log"[10]*"(p-value)")) + theme_bw()
## Warning: Removed 12427 rows containing non-finite values (stat_smooth).